Loading and cleaning the data.

# Import the raw crash records and standardise the column names to snake_case.
airplane_df <- read_csv("datasets/airplane_crashes.csv") |>
  janitor::clean_names() |>
  # discard rows whose ground or aboard value is the literal text "NULL"
  filter(ground != "NULL" & aboard != "NULL") |>
  # drop the flight number and the passenger/crew breakdown columns
  select(
    -c(
      flight_number,
      fatalities_passangers,
      fatalities_crew,
      aboard_passangers,
      aboard_crew
    )
  ) |>
  # keep only rows with no missing value in any of the listed fields
  drop_na(
    date, time, operator, route, aboard, fatalities,
    registration, cn_ln, ground, summary
  )
## Rows: 4967 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (16): Date, Time, Location, Operator, Flight #, Route, AC Type, Registra...
## dbl  (1): Fatalities
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Creating a datetime column.

# Build a single datetime column from the separate date and time strings.
airplane_df <- airplane_df |>
  mutate(
    # strip surrounding whitespace, then blank out the two malformed clock
    # values so they are not combined into a datetime
    time = str_trim(time) |>
      na_if("91:5") |>
      na_if("90:0"),

    # parse "month/day/year hour:minute" text into a datetime
    datetime = mdy_hm(paste(date, time))
  ) |>
  # rows whose datetime failed to parse are discarded
  filter(!is.na(datetime))

Converting variables to their proper variable types.

# Derive calendar fields from `datetime` and coerce the count columns and the
# operator column to the types used by the later summaries and plots.
airplane_df <- airplane_df |>
  mutate(
    year       = year(datetime),
    month      = month(datetime),
    month_name = month(datetime, label = TRUE),

    # Each count column is converted from itself. The previous code assigned
    # as.numeric(ground) to `fatalities`, which replaced the fatality counts
    # with ground casualties and left `ground` as character.
    aboard     = as.numeric(aboard),
    fatalities = as.numeric(fatalities),
    ground     = as.numeric(ground),

    operator   = as.factor(operator)  # to group by operator
  ) |>
  # the raw date/time strings are superseded by `datetime`
  select(-date, -time)

Creating a decade column, now that year is numeric.

# Label each crash with its decade (e.g. "1950s") and move the time-related
# columns to the front of the table.
airplane_df <- airplane_df |>
  mutate(decade = paste0(floor(year / 10) * 10, "s")) |>
  relocate(datetime, year, decade, month, month_name)

Exploring the data.

Crashes per year?

# Static ggplot: number of crashes in each year, with a loess trend overlaid.
airplane_df |>
  count(year, name = "total_crashes") |>
  ggplot(aes(x = year, y = total_crashes)) +
  geom_line() +
  geom_point() +
  # a small span keeps the smoother close to short-term changes
  geom_smooth(span = 0.2, color = "red", se = FALSE) +
  labs(
    title = "Airplane Crashes per Year",
    x = "Year",
    y = "Number of Crashes"
  )
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Interactive plotly version of the crashes-per-year chart.
airplane_df |>
  group_by(year) |>
  summarise(n = n()) |>
  plot_ly(
    x = ~year,
    y = ~n,
    type = "scatter",
    mode = "lines+markers",
    # custom hover text; the empty <extra> tag suppresses the trace-name box
    hovertemplate = "Year: %{x}<br>Crashes: %{y}<extra></extra>"
  ) |>
  layout(
    title = list(text = "Airplane Crashes per Year", font = list(size = 20)),
    xaxis = list(title = list(text = "Year", font = list(size = 16))),
    yaxis = list(
      title = list(text = "Number of Crashes", font = list(size = 16))
    )
  )

Seasonal trends, combining all years?

# Bar chart of crash counts per calendar month, pooled across all years.
airplane_df |>
  count(month_name, name = "total_crashes") |>
  ggplot(aes(x = month_name, y = total_crashes)) +
  geom_col(fill = "blue") +
  labs(title = "Airplane Crashes by Month")

Top airlines with crashes?

# Horizontal bar chart of the 15 operators with the highest crash counts.
airplane_df |>
  count(operator, name = "total_crashes") |>
  slice_max(total_crashes, n = 15) |>
  ggplot(
    aes(
      # order the bars by crash count rather than alphabetically
      x = reorder(operator, total_crashes),
      y = total_crashes
    )
  ) +
  geom_col(fill = "blue") +
  coord_flip() +
  labs(
    title = "Top 15 Airline Operators with the Most Crashes",
    x = "Airline Operators",
    y = "Number of Crashes"
  ) +
  # extra right margin separates the axis title from the operator names
  theme(axis.title.y = element_text(margin = margin(r = 20)))

Avg fatalities per year?

# Line chart of the mean `fatalities` value per crash for each year.
airplane_df |>
  group_by(year) |>
  summarise(
    # missing values are ignored when averaging
    avg_fatalities = mean(fatalities, na.rm = TRUE)
  ) |>
  ggplot(aes(x = year, y = avg_fatalities)) +
  geom_line() +
  geom_point() +
  labs(title = "Average Fatalities per Crash per Year")

Looking at seasonal and yearly trends more closely.

# Heatmap of crash counts by decade (x-axis) and calendar month (y-axis).
# The data are binned by decade, so the title and axis label say "Decade";
# the previous title said "Year" and the x label was misspelled "Deacde".
airplane_df |>
  count(decade, month_name) |>  # crashes in each decade/month cell
  ggplot(aes(x = decade, y = month_name, fill = n)) +
  geom_tile(color = "white") +  # white borders separate the tiles
  scale_fill_gradient(low = "white", high = "red") +
  labs(
    title = "Heatmap of Airplane Crashes by Decade and Month",
    x     = "Decade",
    y     = "Month",
    fill  = "Number of Crashes"
  )